{
 "cells": [
  {
   "cell_type": "raw",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 连接数据库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pymysql\n",
    "from sqlalchemy import create_engine\n",
    "con = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/baiduindexdb')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取省份ID与城市ID对照表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda3\\envs\\Python3.6\\lib\\site-packages\\pymysql\\cursors.py:170: Warning: (1366, \"Incorrect string value: '\\\\xD6\\\\xD0\\\\xB9\\\\xFA\\\\xB1\\\\xEA...' for column 'VARIABLE_VALUE' at row 533\")\n",
      "  result = self._query(query)\n"
     ]
    }
   ],
   "source": [
    "prov_id = pd.read_sql_table(\"province_id\",con)\n",
    "city_id = pd.read_sql_table(\"city_id\",con)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取百度指数数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = \"select * from baidu_index\"\n",
    "index = pd.read_sql(sql,con)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "index.replace(to_replace=r'^\\s*$',value=np.nan,regex=True,inplace=True)\n",
    "index = index.dropna()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据字段格式处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda3\\envs\\Python3.6\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "D:\\Anaconda3\\envs\\Python3.6\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n",
      "D:\\Anaconda3\\envs\\Python3.6\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "index['date'] = pd.to_datetime(index['date']).dt.strftime('%Y-%m')\n",
    "index['keyword'] = index['keyword'].apply(lambda x: x.strip(' \\r\\n\\t').upper())\n",
    "index['_index'] = index['_index'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_index_mean = index.groupby(['keyword','date'])['_index'].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取省份数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = \"SELECT * FROM `province_index`\"\n",
    "prov_index = pd.read_sql(sql,con)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 字段处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "prov_index['date'] = prov_index['date'].apply(lambda x :x.split(\"|\")[0]) # 取出日期前半部分\n",
    "prov_index['date']=pd.to_datetime(prov_index['date']) # 格式化日期\n",
    "prov_index['prov_index'] = prov_index['prov_index'].astype(int)\n",
    "prov_index['date'] = prov_index['date'].dt.strftime('%Y-%m') # 将日期转换成月份\n",
    "prov_index['keyword'] = prov_index['keyword'].apply(lambda x: x.strip(' \\r\\n\\t').upper())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "prov_index_sum = prov_index.groupby(['keyword','date'])['prov_index'].sum() # 各个车型每月搜索指数求和"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>keyword</th>\n",
       "      <th>date</th>\n",
       "      <th>prov_index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>)</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>43396</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>全托幼儿园</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>20988</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>学前班</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>25617</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>寄宿幼儿园</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>15782</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  keyword     date  prov_index\n",
       "0       )  2019-01       43396\n",
       "1   全托幼儿园  2019-01       20988\n",
       "2     学前班  2019-01       25617\n",
       "3   寄宿幼儿园  2019-01       15782"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prov_index_sum = prov_index_sum.reset_index()\n",
    "prov_index_sum.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "prov_index2 = pd.merge(prov_index,prov_index_sum,on=(\"keyword\",\"date\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_index_mean = new_index_mean.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "prov_index2['pct'] = prov_index2['prov_index_x']/prov_index2['prov_index_y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "prov_index_final = pd.merge(prov_index2,new_index_mean,on=('keyword','date'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>keyword</th>\n",
       "      <th>prov</th>\n",
       "      <th>prov_index_x</th>\n",
       "      <th>date</th>\n",
       "      <th>prov_index_y</th>\n",
       "      <th>pct</th>\n",
       "      <th>_index</th>\n",
       "      <th>real_prov_index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>177</td>\n",
       "      <td>全托幼儿园</td>\n",
       "      <td>913</td>\n",
       "      <td>1000</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>20988</td>\n",
       "      <td>0.047646</td>\n",
       "      <td>194.387097</td>\n",
       "      <td>9.261821</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>178</td>\n",
       "      <td>全托幼儿园</td>\n",
       "      <td>917</td>\n",
       "      <td>320</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>20988</td>\n",
       "      <td>0.015247</td>\n",
       "      <td>194.387097</td>\n",
       "      <td>2.963783</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>179</td>\n",
       "      <td>全托幼儿园</td>\n",
       "      <td>910</td>\n",
       "      <td>314</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>20988</td>\n",
       "      <td>0.014961</td>\n",
       "      <td>194.387097</td>\n",
       "      <td>2.908212</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>180</td>\n",
       "      <td>全托幼儿园</td>\n",
       "      <td>901</td>\n",
       "      <td>238</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>20988</td>\n",
       "      <td>0.011340</td>\n",
       "      <td>194.387097</td>\n",
       "      <td>2.204313</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>181</td>\n",
       "      <td>全托幼儿园</td>\n",
       "      <td>911</td>\n",
       "      <td>220</td>\n",
       "      <td>2019-01</td>\n",
       "      <td>20988</td>\n",
       "      <td>0.010482</td>\n",
       "      <td>194.387097</td>\n",
       "      <td>2.037601</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    id keyword  prov  prov_index_x     date  prov_index_y       pct  \\\n",
       "0  177   全托幼儿园   913          1000  2019-01         20988  0.047646   \n",
       "1  178   全托幼儿园   917           320  2019-01         20988  0.015247   \n",
       "2  179   全托幼儿园   910           314  2019-01         20988  0.014961   \n",
       "3  180   全托幼儿园   901           238  2019-01         20988  0.011340   \n",
       "4  181   全托幼儿园   911           220  2019-01         20988  0.010482   \n",
       "\n",
       "       _index  real_prov_index  \n",
       "0  194.387097         9.261821  \n",
       "1  194.387097         2.963783  \n",
       "2  194.387097         2.908212  \n",
       "3  194.387097         2.204313  \n",
       "4  194.387097         2.037601  "
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prov_index_final['real_prov_index'] = prov_index_final['pct'] * prov_index_final['_index']\n",
    "prov_index_final.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "prov_index_final = pd.merge(prov_index_final,prov_id,left_on=\"prov\",right_on=\"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "prov_index_final.to_excel('real_prov.xlsx')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 根据省份处理城市"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = \"SELECT * FROM `city_index`\"\n",
    "city_index = pd.read_sql(sql,con)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 将时间分开，格式化，转成月份"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "city_index['date'] = city_index['date'].apply(lambda x :x.split(\"|\")[0])\n",
    "city_index['date']=pd.to_datetime(city_index['date'])\n",
    "city_index['date'] = city_index['date'].dt.strftime('%B') # 将日期转换成月份"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'prov'",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-46-a77136a5ebbb>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mcity_index\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'keyword'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcity_index\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'keyword'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m' \\r\\n\\t'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mcity_index_sum\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcity_index\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'keyword'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'date'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'prov'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'city_index'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[0mcity_index_sum\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcity_index_sum\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\envs\\Python3.6\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mgroupby\u001b[1;34m(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, **kwargs)\u001b[0m\n\u001b[0;32m   7894\u001b[0m             \u001b[0msqueeze\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msqueeze\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   7895\u001b[0m             \u001b[0mobserved\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mobserved\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 7896\u001b[1;33m             \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   7897\u001b[0m         )\n\u001b[0;32m   7898\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\envs\\Python3.6\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36mgroupby\u001b[1;34m(obj, by, **kwds)\u001b[0m\n\u001b[0;32m   2476\u001b[0m         \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"invalid type: {}\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2477\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2478\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0mklass\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\Anaconda3\\envs\\Python3.6\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, **kwargs)\u001b[0m\n\u001b[0;32m    389\u001b[0m                 \u001b[0msort\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msort\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    390\u001b[0m                 \u001b[0mobserved\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mobserved\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 391\u001b[1;33m                 \u001b[0mmutated\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmutated\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    392\u001b[0m             )\n\u001b[0;32m    393\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\envs\\Python3.6\\lib\\site-packages\\pandas\\core\\groupby\\grouper.py\u001b[0m in \u001b[0;36m_get_grouper\u001b[1;34m(obj, key, axis, level, sort, observed, mutated, validate)\u001b[0m\n\u001b[0;32m    619\u001b[0m                 \u001b[0min_axis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    620\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 621\u001b[1;33m                 \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgpr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    622\u001b[0m         \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgpr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mGrouper\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mgpr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkey\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    623\u001b[0m             \u001b[1;31m# Add key to exclusions\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'prov'"
     ],
     "output_type": "error"
    }
   ],
   "source": [
    "city_index['keyword'] = city_index['keyword'].apply(lambda x: x.strip(' \\r\\n\\t').upper())\n",
    "city_index_sum = city_index.groupby(['keyword','date','prov'])['city_index'].sum()\n",
    "city_index_sum = city_index_sum.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "city_index2 = pd.merge(city_index,city_index_sum,on=(\"keyword\",\"date\",\"prov\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "city_index2['pct'] = city_index2['city_index_x']/city_index2['city_index_y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "city_index_final = pd.merge(city_index2,prov_index_final,on=('keyword','prov','date'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "city_index_final[\"real_city_index\"] = city_index_final['pct_x']*city_index_final['real_prov_index']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "city_index_final = pd.merge(city_index_final,city_id,left_on='city',right_on=\"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "city_index_final.to_excel('city_index_final.xlsx')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
